In this script, we investigate the behavioral pattern of the cell phase scores for the 5 different cell cycle phases for the single cell seq data collected by Yoav’s lab.
For more information on cell cycle definition, go to github.com/singleCell-seq/cell-cycle
setwd('/Users/kushal/Documents/singleCell-method/project/analysis/')
library(data.table)
library(maptpx)
library(gplots)
library(philentropy) # github: HajkD/philentropy
library(dplyr)
library(edgeR)
library(qtlcharts)
library(CountClust)
Import raw count data.
reads <- data.frame(fread('../data/reads.txt'), row.names=1)
molecules <- data.frame(fread('../data/molecules.txt'), row.names=1)
Quality single cell index.
quality_single_cells <- scan("../data/quality-single-cells.txt",
what = "character")
anno <- data.frame(fread('../data/annotation.txt'))
Keep only quality single cells. Remove bulk gene expression data.
molecules <- molecules[, grepl("bulk", colnames(molecules)) |
colnames(molecules) %in% quality_single_cells]
anno <- anno[anno$well == "bulk" | anno$sample_id %in% quality_single_cells, ]
stopifnot(ncol(molecules) == nrow(anno),
colnames(molecules) == anno$sample_id)
reads <- reads[, grepl("bulk", colnames(reads)) |
colnames(reads) %in% quality_single_cells]
stopifnot(ncol(reads) == nrow(anno),
colnames(reads) == anno$sample_id)
Exclude genes with a sum of 0 count across quality single cells.
expressed <- rowSums(molecules[, anno$well == "bulk"]) > 0 &
rowSums(molecules[, anno$well != "bulk"]) > 0
molecules <- molecules[expressed, ]
expressed <- rowSums(reads[, anno$well == "bulk"]) > 0 &
rowSums(reads[, anno$well != "bulk"]) > 0
reads <- reads[expressed, ]
molecules_single <- molecules %>% select(-contains("bulk"))
reads_single <- reads %>% select(-contains("bulk"))
Remove genes with max molecule numer larger than 1024
molecules_single <- molecules_single[apply(molecules_single,1,max) < 1024,];
Now we draw a list of marker genes that have cell cycle information.
cell_cycle_genes <- read.table("../data/cellcyclegenes.txt", header = TRUE, sep="\t")
## create 5 lists of 5 phases (de-level and then remove "")
cell_cycle_genes_list <- lapply(1:5,function(x){
temp <- as.character(cell_cycle_genes[,x])
temp[temp!=""]
})
labs <- unique(unlist(lapply(1:5, function(k)
match(cell_cycle_genes_list[[k]],
rownames(molecules_single)))) )
labs <- labs[!is.na(labs)]
molecules_single_cell_cycle <- molecules_single[labs,]
cpm_data <- voom(t(molecules_single[labs,]))$E
individual_id <- sapply(1:length(rownames(cpm_data)), function(x)
strsplit(rownames(cpm_data)[x],"[.]")[[1]][1])
batch_id <- sapply(1:length(rownames(cpm_data)), function(x)
strsplit(rownames(cpm_data)[x],"[.]")[[1]][2]);
individual.batch.id <- paste0(individual_id, "_", batch_id);
molecules_single <- t(BatchCorrectedCounts(t(molecules_single),individual.batch.id,use_parallel=TRUE));
reads_single <- t(BatchCorrectedCounts(t(reads_single),individual.batch.id,use_parallel=TRUE));
molecules_single_cell_cycle <- molecules_single[labs,];
We explore these genes in greater detail. How do the iplotCurves look like for the single cells for the cell cylce genes.
# Helper function for plotting curves
iplotCurves_cell_cycle <- function(data,genes_list)
{
## data is assumed to be a genes-by-samples matrix, same as molecules_single
labs <- match(genes_list, rownames(data));
temp_data <- data[labs,];
iplotCurves(temp_data);
}
Using the reordering technique of Macosko et al to order the cells as per their position in the cell cycle.
batch_removed_cpm <- voom(t(molecules_single_cell_cycle))$E;
ordered_cells <- as.vector(as.matrix(read.table("../data/ipsc_ordered_cells.txt")));
indices <- match(ordered_cells, colnames(molecules_single_cell_cycle));
batch_removed_cpm <- batch_removed_cpm[indices,];
sample gene cell cycle genes corresponding to first phase.
iplotCurves_cell_cycle(t(batch_removed_cpm),cell_cycle_genes_list[[1]])
## Set screen size to height=700 x width=1000
The iplotCurves for cell cycle genes corresponding to second phase.
iplotCurves_cell_cycle(t(batch_removed_cpm),cell_cycle_genes_list[[2]])
The iplotCurves for cell cycle genes corresponding to third phase.
iplotCurves_cell_cycle(t(batch_removed_cpm),cell_cycle_genes_list[[3]])
The iplotCurves for cell cycle genes corresponding to fourth phase.
iplotCurves_cell_cycle(t(batch_removed_cpm),cell_cycle_genes_list[[4]])
The iplotCurves for cell cycle genes corresponding to fifth phase.
iplotCurves_cell_cycle(t(batch_removed_cpm),cell_cycle_genes_list[[5]])
Not all the cell cycle genes seem to be informative. So, we pick the cell cycle genes that get picked up by the admixture method to be cluster driving. We try to see the patterns of expression of these cluster driving genes and see if they are indeed sinusoidal.
ipsc_topics_cellcycle_batchcorrect <- get(load("../../project/rdas/topic_fit_ipsc_batchcorrect_cellcycle.rda"));
topics_clust <- ipsc_topics_cellcycle_batchcorrect[[2]]
clust <- topics_clust$K
theta <- topics_clust$theta
features <- ExtractTopFeatures(theta,top_features=50,method="poisson")
features_vec <- unique(as.vector(features));
class <- as.numeric(apply(theta[features_vec,], 1, which.max))
imp_gene_names <- colnames(batch_removed_cpm[,features_vec]);
iplotCurves_cell_cycle(t(batch_removed_cpm),imp_gene_names)
We compute cell cycle scores for the cell cycle genes for every single cell.
ans <- sapply(cell_cycle_genes_list,function(xx){
#### create table of each phase
reads_single_phase <- reads_single[rownames(reads_single) %in% unlist(xx) ,]
#### add average expression of all genes in the phase
combined_matrix <- rbind(reads_single_phase,
average=apply(reads_single_phase,2,mean))
#### use transpose to compute cor matrix
cor_matrix <- cor(t(combined_matrix))
#### take the numbers
cor_vector <- cor_matrix[,dim(cor_matrix)[1]]
#### restrict to correlation >= 0.3
reads_single_phase_restricted <- reads_single_phase[rownames(reads_single_phase) %in% names(cor_vector[cor_vector >= 0.3]),]
#### apply normalization to reads
norm_factors_single <- calcNormFactors(reads_single_phase_restricted, method = "TMM")
reads_single_cpm <- cpm(reads_single_phase_restricted, log = TRUE,
lib.size = colSums(reads_single) * norm_factors_single)
#### output the phase specific scores (mean of normalized expression levels in the phase)
apply(reads_single_cpm,2,mean)
})
We make a iplotCurves plot for the cell phase scores of the 578 single cells.
iplotCurves(ans)
Now we apply a 2-step normalization on the phase scores.
#### normalization function
flexible_normalization <- function(data_in,by_row=TRUE){
if(by_row){
row_mean <- apply(data_in,1,mean)
row_sd <- apply(data_in,1,sd)
output <- data_in
for(i in 1:dim(data_in)[1]){
output[i,] <- (data_in[i,] - row_mean[i])/row_sd[i]
}
}
#### if by column
if(!by_row){
col_mean <- apply(data_in,2,mean)
col_sd <- apply(data_in,2,sd)
output <- data_in
for(i in 1:dim(data_in)[2]){
output[,i] <- (data_in[,i] - col_mean[i])/col_sd[i]
}
}
output
}
#### apply the normalization function
## first normalized for each phase
ans_normed <- flexible_normalization(ans,by_row=FALSE)
## then normalized of each cell
ans_normed_normed <- flexible_normalization(ans_normed,by_row=TRUE)
The iplotCurves plot after the 2-way normalization.
iplotCurves(ans_normed_normed)
Assign the single cells to the cell phase
cell_phase <- apply(ans_normed_normed,1,function(x) colnames(cell_cycle_genes)[which.max(x)])
assign_cell_phase <- data.frame(cell_phase)
cell_phase_vector <- as.vector(as.matrix(assign_cell_phase));
cell_phase_vector <- factor(cell_phase_vector,
levels = c("G1.S", "S", "G2.M", "M", "M.G1"))